{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Constructing control variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import os\n",
    "import json\n",
    "import numpy as np\n",
    "import time\n",
    "import random\n",
    "from json.decoder import JSONDecodeError"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('anti_vax.json') as f:\n",
    "    anti_twts = json.load(f)\n",
    "with open('pro_vax.json') as f:\n",
    "    pro_twts = json.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "def ExcludeTwtKeywords(twts, target_keywords): \n",
    "    result_twts = list()\n",
    "    for twt in twts: \n",
    "        if sum([(k in target_keywords) for k in twt['keywords']]) < len(twt['keywords']):\n",
    "            result_twts.append(twt)\n",
    "    return result_twts\n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "anti_twts_cleaned = ExcludeTwtKeywords(anti_twts, ['plot','plots','viz','vis'])\n",
    "pro_twts_cleaned = ExcludeTwtKeywords(pro_twts, ['plot','plots','viz','vis'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Length of the tweet (in chars)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def getTwtTextLen(twts):\n",
    "    twt_len = dict()\n",
    "    for twt in twts: \n",
    "        text_len = len(twt['text'])\n",
    "        twt_id = twt['id']\n",
    "        twt_len[twt_id] = text_len\n",
    "    return twt_len\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "anti_twts_lens = getTwtTextLen(anti_twts_cleaned)\n",
    "pro_twts_lens = getTwtTextLen(pro_twts_cleaned)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Number of followers and number of tweets of an account"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bearer_oauth(r):\n",
    "    \"\"\"\n",
    "    Method required by bearer token authentication.\n",
    "    \"\"\"\n",
    "\n",
    "    r.headers[\"Authorization\"] = f\"Bearer {bearer_token}\"\n",
    "    r.headers[\"User-Agent\"] = \"v2FullArchiveSearchPython\"\n",
    "    return r\n",
    "\n",
    "\n",
    "def connect_to_endpoint(url, params):\n",
    "    response = requests.request(\"GET\", search_url, auth=bearer_oauth, params=params)\n",
    "    #print(response.status_code)\n",
    "    if response.status_code != 200:\n",
    "        raise Exception(response.status_code, response.text)\n",
    "    return response.json()\n",
    "\n",
    "# Read the bearer token from an environment variable\n",
    "bearer_token = os.environ.get(\"BEARER_TOKEN\")\n",
    "if not bearer_token:\n",
    "    raise RuntimeError(\"Missing BEARER_TOKEN environment variable.\")\n",
    "\n",
    "SEARCH_URL = \"https://api.twitter.com/2/tweets/search/all\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "def setQuery_params(user_ids):\n",
    "\n",
    "    query_params = {'ids': user_ids, \n",
    "                    'user.fields': 'public_metrics'\n",
    "                   }\n",
    "    return query_params"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_unique_user_ids(twts): \n",
    "    unique_ids = set()\n",
    "    for twt in twts: \n",
    "        unique_ids.add(twt['author_id'])\n",
    "    return list(unique_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_user_ids(unique_ids, n): \n",
    "    # Yield successive n-sized\n",
    "     \n",
    "    # looping till length l\n",
    "    for i in range(0, len(unique_ids), n):\n",
    "        yield unique_ids[i:i + n]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "def chunk_list_to_str(anti_unique_ids_chunks): \n",
    "    string_list = list()\n",
    "    for chunk in anti_unique_ids_chunks: \n",
    "        string = ''\n",
    "        for item in chunk:\n",
    "            if item == chunk[-1]: \n",
    "                string = string + item\n",
    "            else:\n",
    "                string = string + item + ','\n",
    "        string_list.append(string)\n",
    "    return string_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "anti_unique_ids = get_unique_user_ids(anti_twts_cleaned)\n",
    "anti_unique_ids_chunks = list(split_user_ids(anti_unique_ids, 100))\n",
    "anti_unique_ids_chunks_str = chunk_list_to_str(anti_unique_ids_chunks)\n",
    "\n",
    "pro_unique_ids = get_unique_user_ids(pro_twts_cleaned)\n",
    "pro_unique_ids_chunks = list(split_user_ids(pro_unique_ids, 100))\n",
    "pro_unique_ids_chunks_str = chunk_list_to_str(pro_unique_ids_chunks)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "anti_data = list()\n",
    "for chunk_str in anti_unique_ids_chunks_str: \n",
    "    query_params = setQuery_params(chunk_str)\n",
    "    while True: \n",
    "        try:\n",
    "            new_data = connect_to_endpoint(search_url, query_params)\n",
    "            anti_data = anti_data + new_data['data']\n",
    "            break\n",
    "        except Exception as e: \n",
    "            if e[0] == 429: #exceed rate limit\n",
    "                time.sleep(900)\n",
    "            else: \n",
    "                print(e)\n",
    "                break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "pro_data = list()\n",
    "for chunk_str in pro_unique_ids_chunks_str: \n",
    "    query_params = setQuery_params(chunk_str)\n",
    "    while True: \n",
    "        try:\n",
    "            new_data = connect_to_endpoint(search_url, query_params)\n",
    "            pro_data = pro_data + new_data['data']\n",
    "            break\n",
    "        except Exception as e: \n",
    "            if e[0] == 429: #exceed rate limit\n",
    "                time.sleep(900)\n",
    "            else: \n",
    "                print(e)\n",
    "                break"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Mapping the number of followers and tweets of an account to a tweet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "tweets = anti_twts_cleaned + pro_twts_cleaned\n",
    "\n",
    "anti_users = {str(u[\"id\"]): u for u in anti_data}\n",
    "pro_users = {str(u[\"id\"]): u for u in pro_data}\n",
    "\n",
    "with open(\"results/controls.csv\", \"w\", newline=\"\", encoding=\"utf-8\") as out:\n",
    "    writer = csv.writer(out)\n",
    "    writer.writerow([\"id\", \"following_count\", \"tweet_count\", \"type\", \"len\"])\n",
    "\n",
    "    for tweet in tweets:\n",
    "        tweet_id = str(tweet.get(\"id\", \"\"))\n",
    "        author_id = str(tweet.get(\"author_id\", \"\"))\n",
    "        text_len = len(tweet['text'])\n",
    "\n",
    "        if author_id in anti_users:\n",
    "            user = anti_users[author_id]\n",
    "            metrics = user.get(\"public_metrics\", {})\n",
    "            writer.writerow([\n",
    "                tweet_id,\n",
    "                metrics.get(\"following_count\", \"\"),\n",
    "                metrics.get(\"tweet_count\", \"\"),\n",
    "                \"anti\",\n",
    "                text_len\n",
    "            ])\n",
    "\n",
    "        elif author_id in pro_users:\n",
    "            user = pro_users[author_id]\n",
    "            metrics = user.get(\"public_metrics\", {})\n",
    "            writer.writerow([\n",
    "                tweet_id,\n",
    "                metrics.get(\"following_count\", \"\"),\n",
    "                metrics.get(\"tweet_count\", \"\"),\n",
    "                \"pro\",\n",
    "                text_len\n",
    "            ])\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:base] *",
   "language": "python",
   "name": "conda-base-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
